#!/usr/bin/python

import sys,os,re,optparse,shutil,glob
import matplotlib
# Pick the matplotlib backend from the environment: AGG when no DISPLAY is
# set, GTK otherwise.  This is done before pylab is imported below.
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
import signal
# On SIGINT (Ctrl-C) call sys.exit(1), which raises SystemExit in the
# interrupted code.
signal.signal(signal.SIGINT,lambda *args:sys.exit(1))
from matplotlib import patches
from pylab import *
from scipy.stats.stats import trim1
from multiprocessing import Pool
from scipy.ndimage import measurements,interpolation
from scipy.misc import imsave
import traceback

from ocrolib import number_of_processors
import ocrolib

# Command line definition.  The parsed values end up in the module-level
# `options` object and the positional image file names in `args`.
parser = optparse.OptionParser(usage="""
%prog [options] image1.png image2.png ...

Usually, you would use an argument pattern like: book/????/??????.png

Computes page segmentations and extracts text lines.
For each input image image.png, it generates:

* image.pseg.png -- page segmentation
* image/010001.png -- gray scale text line image column 1, line 1
* image/010001.bin.png -- binary text line image column 1, line 1

Use the -d or -D argument to verify that the layout analysis is working
correctly.

If image.bin.png exists, it uses it.  If not, it uses built-in
preprocessing and generates and writes its own binary version.
(The original gray scale image is, however, not altered, so it
may be rotated relative to the binary image.)

If an image.tiseg.png file exists, it uses it to constrain the
layout analysis (although some layout analysis methods may be
ignoring the map).
""")
parser.add_option("-g","--gray",action="store_true",
                  help="output grayscale images + binary masks")
parser.add_option("-u","--upscale",type=int,default=32,
                  help="upscale lines shorter than this to the given target height")
parser.add_option("-t","--target",type=int,default=64,
                  help="downscale lines taller than this to the given target height")
parser.add_option("-L","--low",type=int,default=16,
                  help="lower limit for text line height")
parser.add_option("-H","--high",type=int,default=200,
                  help="upper limit for text line height")
parser.add_option("-W","--width",type=int,default=20,
                  help="lower limit for text line width")
parser.add_option("-v","--verbose",action="store_true",
                  help="output additional information")
parser.add_option("-p","--pad",default=1,type=int,
                  help="pad lines by this amount")
parser.add_option("-d","--display",action="store_true",
                  help="display result")
parser.add_option("-D","--Display",action="store_true",
                  help="display continuously")
parser.add_option("-S","--segmenter",default="ocrorast.SegmentPageByRAST",
                  help="which segmentation component to use")
parser.add_option("-P","--preproc",default="ocropreproc.CommonPreprocessing",
                  help="which preprocessing component to use")
parser.add_option("-r","--dpi",default=300,type=int,
                  help="resolution of input image in DPI")
parser.add_option("-q","--silent",action="store_true",
                  help="disable warnings")
# NOTE: this is a store_false flag on a True default, so blackout is ON unless
# -b is given; the help text describes what passing the flag actually does.
parser.add_option("-b","--blackout",action="store_false",default=True,
                  help="pass image regions to the segmenter as rectangles instead of blacking them out (blackout is the default)")
parser.add_option("-R","--descender",type=float,default=-1,
                  help="maximum descender")
parser.add_option("-Q","--parallel",type=int,default=number_of_processors(),
                  help="number of parallel processes to use")
options,args = parser.parse_args()

# A single directory argument stands for the ????.png files directly inside it.
if len(args)==1 and os.path.isdir(args[0]):
    args = glob.glob("%s/????.png"%args[0])
    args.sort()

# Build the binarization and page segmentation components named on the
# command line.
preproc = ocrolib.make_IBinarize(options.preproc)
segmenter = ocrolib.make_ISegmentPage(options.segmenter)

# The default of -1 leaves the segmenter's max_descender parameter untouched.
if options.descender>0:
    segmenter.pset("max_descender",options.descender)

# Continuous display (-D) implies display (-d); displaying turns on pylab's
# interactive mode.
if options.Display:
    options.display = 1
if options.display:
    ion()

def process_arg(arg):
    print "===",arg
    base,_ = ocrolib.allsplitext(arg)
    if os.path.exists(base):
        print "# output directory",base,"already exists"
        return

    image = ocrolib.read_image_gray(arg,'B')
    h,w = image.shape

    if options.display:
        clf(); imshow(image,cmap=cm.gray); draw(); ginput(1,1)

    # get a binary image, either saved on disk or through
    # preprocessing
    if os.path.exists(base+".bin.png"):
        print "# loading",base+".bin.png"
        page_gray = image
        page_bin = ocrolib.read_image_gray(base+".bin.png")
    else:
        print "# binarizing"
        (page_bin,page_gray) = preproc.binarize(image)
        ocrolib.write_image_gray(base+".bin.png",page_bin)

    # get a text/image segmentation if available
    page_ti = None
    tirects = None
    if os.path.exists(base+".tiseg.png"):
        # FIXME these aren't implemented
        # print "# loading",base+".tiseg.png"
        # rgb = ocrolib.read_image_packed(page_ti,base+".tiseg.png")
        # ocrolib.unpack_rgb(r,g,b,page_ti)
        # n = ocrolib.label_components(components)
        # print "# number of image regions",n
        # ocrolib.bounding_boxes(tirects,components)
        raise Exception("tiseg handling unimplemented right now")

    if options.blackout and tirects is not None:
        for (r0,c0,r1,c1) in tirects:
            page_bin[r0:r1,c0:c1] = 0

    if options.display:
        clf(); imshow(page_bin,cmap=cm.gray); draw(); ginput(1,1)

    print "# segmenting"
    if options.blackout:
        page_seg = segmenter.segment(page_bin)
    else:
        page_seg = segmenter.segment(page_bin,tirects)
    regions = ocrolib.RegionExtractor()
    regions.setPageLines(page_seg)
    if os.path.exists(base):
        print "# removing",base
        shutil.rmtree(base)

    os.mkdir(base)

    nregions = regions.length()
    print "# writing",nregions,"lines"
    if options.display:
        clf()
        axis = subplot(111)
        axis.imshow(page_bin,cmap=cm.gray)
        ocrolib.draw_pseg(page_seg,axis)

    for i in range(1,regions.length()):
        line = regions.extractMasked(page_bin,i,0,255,options.pad)
        if line.shape[0]<options.low:
            if options.verbose: print "# skipping %06x"%regions.id(i),", not tall enough:",line.shape[0]
            continue
        if line.shape[0]>options.high: 
            if options.verbose: print "# skipping %06x"%regions.id(i),", too tall:",line.shape[0]
            continue
        if line.shape[1]<options.width:
            if options.verbose: print "# skipping %06x"%regions.id(i),", not wide enough:",line.shape[1]
            continue
        if options.upscale>0 and line.shape[0]<options.upscale:
            scale = options.upscale*1.0/line.shape[0]
            if options.verbose:
                print "# upscaling %06x"%regions.id(i),"by",scale
            line = interpolation.zoom(line,(scale,scale),order=1)
            assert abs(line.shape[0]-options.upscale)<2,line.shape
        elif options.target>0 and line.shape[0]>options.target:
            scale = options.target*1.0/line.shape[0]
            if options.verbose:
                print "# downscaling %06x"%regions.id(i),"by",scale
            line = interpolation.zoom(line,(scale,scale),order=1)
            assert abs(line.shape[0]-options.target)<2,line.shape
        if not options.silent:
            if ocrolib.quick_check_line_components(line,dpi=options.dpi)<0.5:
                continue
        assert (regions.id(i)&0xff0000)>0
        if options.gray:
            ocrolib.write_image_gray("%s/%06x.bin.png"%(base,regions.id(i)),line)
            line = regions.extract(page_gray,i,options.pad)
            ocrolib.write_image_gray("%s/%06x.png"%(base,regions.id(i)),line)
        else:
            ocrolib.write_image_gray("%s/%06x.bin.png"%(base,regions.id(i)),line)
            ocrolib.write_image_gray("%s/%06x.png"%(base,regions.id(i)),line)
    ocrolib.write_page_segmentation(base+".pseg.png",page_seg)
    if options.display:
        draw()
        if not options.Display: 
            raw_input("hit ENTER to continue")
        else:
            ginput(1,timeout=1)

def process_arg_safe(arg):
    try:
        process_arg(arg)
    except:
        traceback.print_exc()
        print "# OOPS",arg,"failed"

# Process all pages, serially or with a pool of worker processes.
if options.parallel<2:
    for arg in args:
        process_arg_safe(arg)
else:
    pool = Pool(processes=options.parallel)
    jobs = args
    try:
        # pool.map() waits without a timeout, and under Python 2 such a wait
        # cannot be interrupted, so the SIGINT handler would not run until
        # every page had been processed.  Waiting with a (very long, ~115
        # days) timeout keeps the main process responsive to Ctrl-C.
        result = pool.map_async(process_arg_safe,jobs).get(9999999)
    finally:
        # shut the workers down on normal completion as well as on interrupt
        pool.terminate()

